Frequency analysis of letters in books


In [6]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import os
sns.set(style="whitegrid", color_codes=True)

import string

get books

we need some books for the program to read and analyse, a good source for books would be http://www.gutenberg.org/

starting a book reader

first you will need to create a start line so the program will know where to start


In [32]:
#this code is loading the book(in this case boy.txt) and getting the title

with open('boy.txt', encoding='utf-8') as book:
    for i, line in enumerate(book):
        #this is getting the title from the text
        if line.startswith('Title:'):
            title = line[6:]
            print("The title is", title)


The title is  Three Hundred Things a Bright Boy Can Do

Reading the book

this code will read the text and start adding up the letters


In [34]:
with open('boy.txt', encoding='utf-8') as book:
    for i, line in enumerate(book):
        if line.startswith('Title:'):
            title = line[6:]
            print("The title is", title)

        if line.startswith('*** START OF THIS PROJECT'):
            break
            
    letter_frequencies = {}
    for letter in string.ascii_uppercase:
        letter_frequencies[letter] = 0
            
    for i, line in enumerate(book):
        #This starts reading^
        
        '''this code is putting everything to uppercase so the 
        'reader program' can get every single letter'''
        for char in line:
            if char in string.ascii_letters:
                uppercase = char.upper()
                letter_frequencies[uppercase] += 1
                
    print(letter_frequencies)


The title is  Three Hundred Things a Bright Boy Can Do

{'A': 41840, 'B': 10410, 'C': 15605, 'D': 19794, 'E': 66177, 'F': 13732, 'G': 11575, 'H': 31540, 'I': 37303, 'J': 540, 'K': 4602, 'L': 23348, 'M': 11317, 'N': 35946, 'O': 41309, 'P': 11375, 'Q': 959, 'R': 30966, 'S': 33131, 'T': 51717, 'U': 15274, 'V': 4547, 'W': 11190, 'X': 1177, 'Y': 9458, 'Z': 380}

assigning the end of the book

now we have to make the program know where the end of the book is by using the gutenburg books common ending.


In [ ]:
if line.startswith('*** END OF THIS PROJECT'):
                break

adding up and getting percentages

now you can get the add the letters up and get the percentages in which they are used


In [27]:
total = sum(letter_frequencies.values())
             
    letter_percentages = {}
    for letter, count in letter_frequencies.items():
        letter_percentages[letter] = count/total * 100

    data = {
            'letters': list(letter_percentages.keys()),
            'percentages': list(letter_percentages.values()),
        }


---------------------------------------------------------------------------
ZeroDivisionError                         Traceback (most recent call last)
<ipython-input-27-a4f6e5060882> in <module>()
      3 letter_percentages = {}
      4 for letter, count in letter_frequencies.items():
----> 5     letter_percentages[letter] = count/total * 100
      6 
      7 data = {

ZeroDivisionError: division by zero

Plotting the data and analysing

lastly we can plot the data to get a understanding of the percentages of all occuring letters


In [12]:
plt.figure()
        plot = sns.barplot(x='letters',y='percentages', data=data, palette='rainbow_r')
        plot.set_title("{0}\n total {1} characters".format(title, total))
        plot.set_xlabel('Letters')
        plot.set_ylabel('Percentages')


        fmt = '%.0f%%' 
        yticks = mtick.FormatStrFormatter(fmt)
        plot.yaxis.set_major_formatter(yticks)

#   print('The total number of letters is', total)

The code in full

this is what the code should look like when you have put it all together


In [9]:
def word_analysis(filename):

    title = ''

    with open(filename, encoding='utf-8') as book:
        #print(book.read()[:1000])
        for i, line in enumerate(book):
            if line.startswith('Title:'):
                title = line[6:]
            if line.startswith('*** START OF THIS PROJECT'):
                break

        #set up letter counter v   
        letter_frequencies = {}
        for letter in string.ascii_uppercase:
            letter_frequencies[letter] = 0

        for i, line in enumerate(book):
            #This starts reading^
            for char in line:
                if char in string.ascii_letters:
                    uppercase = char.upper()
                    letter_frequencies[uppercase] += 1

            if line.startswith('*** END OF THIS PROJECT'):
                break

#         print (letter_frequencies)

        total = sum(letter_frequencies.values())
             
        letter_percentages = {}
        for letter, count in letter_frequencies.items():
            letter_percentages[letter] = count/total * 100

        data = {
            'letters': list(letter_percentages.keys()),
            'percentages': list(letter_percentages.values()),
        }

        plt.figure()
        plot = sns.barplot(x='letters',y='percentages', data=data, palette='rainbow_r')
        plot.set_title("{0}\n total {1} characters".format(title, total))
        plot.set_xlabel('Letters')
        plot.set_ylabel('Percentages')


        fmt = '%.0f%%' 
        yticks = mtick.FormatStrFormatter(fmt)
        plot.yaxis.set_major_formatter(yticks)

#   print('The total number of letters is', total)
# print(list(os.walk('.')))

In [11]:
for root, dirs, files in os.walk('.'):
    for f in files:
        if f.endswith('.txt'):
            word_analysis(f)
            
    # Stop walking sub directories
    break



In [ ]: